1 Libraries

This workflow reanalyze the single nucleus RNA-seq data produced by (Habib et al. 2017), using DroNc-seq: massively parallel sNuc-seq with droplet technology.

We will first load a few libraries. Among them, * DropletUtils provides functions for data from droplet technologies such as 10X Genomics. * biomaRt provides easy access to databases, such as Ensembl, COSMIC, Uniprot, HGNC, etc. * scater is a collection of tools for doing quality control analyses of scRNA-seq * scran provide functions for normalization of cell-specific libary sizes, correcting batch effects, and identification marker genes

bio_packs = c("SingleCellExperiment","DropletUtils","biomaRt",
            "scater","scran","limma","org.Hs.eg.db","SC3")
source("https://bioconductor.org/biocLite.R")
for(pack1 in bio_packs){
    if( !pack1 %in% installed.packages()[,"Package"]){
        biocLite(pack1, suppressUpdates = TRUE)
    }
}

cran_packs = c("data.table","svd","Rtsne","stringi","irlba")
for(pack1 in cran_packs){
    if( !pack1 %in% installed.packages()[,"Package"]){
        install.packages(pack1)
    }
}

library(data.table)
library(SingleCellExperiment)
library(DropletUtils)
library(biomaRt)
library(scater)
library(scran)
library(limma)
library(ggplot2)

We setup different directories, and set num_cores improve SC3 runtime, if run_sc3 is TRUE. The step of runing SC clustering can take five or more hours without using more cores.

repo_dir  = "~/research/GitHub/scRNAseq_pipelines"
work_dir  = file.path(repo_dir,"dronc")
dronc_dir = "~/research/scRNAseq/data/GTEx_droncseq_hip_pcf"
# repo_dir = "/pine/scr/p/l/pllittle/CS_eQTL/s3_Real/scRNAseq_pipelines"
# work_dir = file.path(repo_dir,"dronc")
# dronc_dir = file.path(work_dir,"GTEx_droncseq_hip_pcf")

source(file.path(repo_dir,"SOURCE.R"))

run_sc3   = TRUE
num_cores = ifelse(run_sc3,7,1)

2 Obtaining/Loading Counts

Next we import in the count data and other available information. The dataset is available here.

counts_fn  = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.umi_counts.txt.gz")
counts     = fread(cmd=sprintf("zcat < %s",counts_fn),data.table = FALSE)
dim(counts); counts[1:3,1:2]
## [1] 32111 14964
##         V1 hHP1_AACACTATCTAC
## 1     A1BG                 0
## 2 A1BG-AS1                 0
## 3     A1CF                 0
rownames(counts) = counts$V1
counts = as.matrix(counts[,-1])
colnames(counts)[1:10]
##  [1] "hHP1_AACACTATCTAC" "hHP1_CTACGCATCCAT" "hHP1_TCGGTACTAATA"
##  [4] "hHP1_CCCGCACGCTAT" "hHP1_TCATTTTGTCAT" "hHP1_ACGAGGTCTATG"
##  [7] "hHP1_AGTCATGAGGTT" "hHP1_GTTAGTATACCA" "hHP1_GCATTCAGTAAG"
## [10] "hHP1_AGACCGCGACTA"
part_cell_id = sapply(colnames(counts), 
                      function(xx) strsplit(xx,"_")[[1]][1],
                      USE.NAMES=FALSE)

col_dat = smart_df(sample_name = colnames(counts), 
                   part_cell_id = part_cell_id)
col_dat[1:5,]
##         sample_name part_cell_id
## 1 hHP1_AACACTATCTAC         hHP1
## 2 hHP1_CTACGCATCCAT         hHP1
## 3 hHP1_TCGGTACTAATA         hHP1
## 4 hHP1_CCCGCACGCTAT         hHP1
## 5 hHP1_TCATTTTGTCAT         hHP1

Import clustering results by (Habib et al. 2017). We utilize the supplemental files to label clusters and incorporate the existing TSNE results. Refer to the file nmeth.4407-S10.xlsx.

clust_fn = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.clusters.txt.gz")
clust    = fread(cmd = sprintf("zcat < %s",clust_fn),data.table = FALSE)
dim(clust); clust[1:5,]
## [1] 14963     2
##                  V1 V2
## 1 hHP1_AACACTATCTAC  4
## 2 hHP1_CTACGCATCCAT  3
## 3 hHP1_TCGGTACTAATA  3
## 4 hHP1_CCCGCACGCTAT  3
## 5 hHP1_TCATTTTGTCAT  3
names(clust) = c("sample_name","Habib_clusters")
clust$Habib_clusters = as.factor(clust$Habib_clusters)

# Double check sample names
all(col_dat$sample_name == clust$sample_name)
## [1] TRUE
map_clust_name = smart_df(Habib_clusters = as.factor(seq(19)),
    Habib_clusters_name = c("exPFC1","exPFC2","exCA1","exCA3",
        "GABA1","GABA2","exDG","ASC1","ASC2","ODC1","ODC2",
        "OPC","MG","NSC","END",rep(NA,4)),
    Habib_cell_type = c("exPFC","exPFC","exCA","exCA",
        "GABA","GABA","exDG","ASC","ASC","ODC","ODC","OPC",
        "MG","NSC","END",rep(NA,4)))
clust = smart_merge(clust,map_clust_name)
clust = clust[match(col_dat$sample_name,clust$sample_name),]
clust[1:5,]
##      Habib_clusters       sample_name Habib_clusters_name Habib_cell_type
## 9142              4 hHP1_AACACTATCTAC               exCA3            exCA
## 8826              3 hHP1_CTACGCATCCAT               exCA1            exCA
## 8732              3 hHP1_TCGGTACTAATA               exCA1            exCA
## 8735              3 hHP1_CCCGCACGCTAT               exCA1            exCA
## 8731              3 hHP1_TCATTTTGTCAT               exCA1            exCA

We marge the TSNE results from (Habib et al. 2017) into the sce object. We also check for spike-ins, and as expected, there is no spike-ins in this dataset.

tsne_fn = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.tsne.txt.gz")
df_tsne = fread(cmd = sprintf("zcat < %s",tsne_fn),data.table = FALSE)
dim(df_tsne); df_tsne[1:5,]
## [1] 14963     3
##                  V1     tSNE_1   tSNE_2
## 1 hHP1_AACACTATCTAC  -8.436537 7.721106
## 2 hHP1_CTACGCATCCAT  -6.606278 8.099538
## 3 hHP1_TCGGTACTAATA  -6.724967 8.011943
## 4 hHP1_CCCGCACGCTAT  -6.286364 7.892359
## 5 hHP1_TCATTTTGTCAT -10.455725 2.374719
names(df_tsne) = c("sample_name",paste0("Habib_TSNE",1:2))
table(df_tsne$sample_name == clust$sample_name)
## 
##  TRUE 
## 14963
table(df_tsne$sample_name == clust$sample_name)
## 
##  TRUE 
## 14963
is_match = (col_dat$sample_name == df_tsne$sample_name) & 
    (df_tsne$sample_name == clust$sample_name); table(is_match)
## is_match
##  TRUE 
## 14963
col_dat = cbind(col_dat,
                clust[,names(clust) != "sample_name"],
                df_tsne[,names(df_tsne) != "sample_name"])
col_dat[1:5,]
##            sample_name part_cell_id Habib_clusters Habib_clusters_name
## 9142 hHP1_AACACTATCTAC         hHP1              4               exCA3
## 8826 hHP1_CTACGCATCCAT         hHP1              3               exCA1
## 8732 hHP1_TCGGTACTAATA         hHP1              3               exCA1
## 8735 hHP1_CCCGCACGCTAT         hHP1              3               exCA1
## 8731 hHP1_TCATTTTGTCAT         hHP1              3               exCA1
##      Habib_cell_type Habib_TSNE1 Habib_TSNE2
## 9142            exCA   -8.436537    7.721106
## 8826            exCA   -6.606278    8.099538
## 8732            exCA   -6.724967    8.011943
## 8735            exCA   -6.286364    7.892359
## 8731            exCA  -10.455725    2.374719
sce = SingleCellExperiment(assays = list(counts = counts),colData = col_dat)
sce
## class: SingleCellExperiment 
## dim: 32111 14963 
## metadata(0):
## assays(1): counts
## rownames(32111): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(14963): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
##   PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(7): sample_name part_cell_id ... Habib_TSNE1
##   Habib_TSNE2
## reducedDimNames(0):
## spikeNames(0):
rownames(sce)[grep("^ERCC",rownames(sce))]
## [1] "ERCC1"   "ERCC2"   "ERCC3"   "ERCC4"   "ERCC5"   "ERCC6"   "ERCC6L" 
## [8] "ERCC6L2" "ERCC8"

3 Pre-processing and Quality Control

3.1 Gene Annotation

We will extract annotation information based on gene names. Since the existing count data from (Habib et al. 2017) were based on mapping to hg19, we use this version of reference genome.

anno_file = file.path(work_dir,"gene.annotation_dronc.rds")
if( file.exists(anno_file) ){

    gene_anno = readRDS(anno_file)

} else{

    ensembl = useEnsembl(biomart="ensembl",GRCh=37,
        dataset="hsapiens_gene_ensembl")

    attr_string = c("hgnc_symbol","external_gene_name","chromosome_name",
        "start_position","end_position","strand","description",
        "percentage_gene_gc_content","gene_biotype")

    gene_anno = getBM(attributes = attr_string,
        filters = "external_gene_name",
        values = rownames(sce),
        mart = ensembl)
    saveRDS(gene_anno, file=anno_file)
    
}

dim(sce); dim(gene_anno)
## [1] 32111 14963
## [1] 34526     9

We remove those genes that are part of extracted annotation, but aren’t in sce.

w2rm = which(!gene_anno$external_gene_name %in% rownames(sce))
w2rm
## [1] 4232 4645
gene_anno[w2rm,]
##      hgnc_symbol external_gene_name chromosome_name start_position
## 4232                       C10ORF68              10       32832297
## 4645                       C1ORF220               1      178511950
##      end_position strand
## 4232     33171802      1
## 4645    178517579      1
##                                                                                                               description
## 4232 Homo sapiens coiled-coil domain containing 7 (CCDC7), transcript variant 5, mRNA. [Source:RefSeq mRNA;Acc:NM_024688]
## 4645                                                                                                                     
##      percentage_gene_gc_content   gene_biotype
## 4232                      37.23 protein_coding
## 4645                      49.91 protein_coding
gene_anno = gene_anno[-w2rm,]
dim(sce); dim(gene_anno)
## [1] 32111 14963
## [1] 34524     9

Many genes have multiple entries in the annotation, often because they are annotated to scaffolds, assembly patches and alternate loci. Here we simply remove such entries. The we remove duplicated annotations and genes without annotations.

table(gene_anno$chromosome_name)[1:30]
## 
##          1         10         11         12         13         14 
##       3084       1252       1741       1694        637       1145 
##         15         16         17         18         19          2 
##       1169       1426       1820        633       1955       2186 
##         20         21         22          3          4          5 
##        800        383        704       1784       1301       1567 
##          6          7          8          9 GL000192.1 GL000193.1 
##       1622       1446       1296       1207          2          1 
## GL000194.1 GL000195.1 GL000199.1 GL000204.1 GL000205.1 GL000213.1 
##          2          1          1          1          2          1
chr_nms = c(1:22,"X","Y","MT")
gene_anno = gene_anno[which(gene_anno$chromosome_name %in% chr_nms),]
dim(sce); dim(gene_anno)
## [1] 32111 14963
## [1] 31978     9
t1 = table(gene_anno$external_gene_name)
t2 = sort(t1[t1 > 1], decreasing=TRUE)
length(t2)
## [1] 40
t2[1:10]
## 
##     SNORD113    MIR1302-2       NPIPA7      CCDC177        CDRT1 
##            6            4            3            2            2 
##    CEBPA-AS1      FAM226B FAM47E-STBD1         GATS      GOLGA7B 
##            2            2            2            2            2
gene_anno[which(gene_anno$external_gene_name %in% names(t2)[1:4]), 1:4]
##       hgnc_symbol external_gene_name chromosome_name start_position
## 5013      CCDC177            CCDC177              14       70037483
## 5014                         CCDC177              14       70038216
## 14832  MIR1302-11          MIR1302-2              19          71161
## 14833  MIR1302-10          MIR1302-2              19          71161
## 14834   MIR1302-9          MIR1302-2              19          71161
## 14835   MIR1302-2          MIR1302-2              19          71161
## 16502                         NPIPA7              16       16411301
## 16503      NPIPA7             NPIPA7              16       16472912
## 16505                         NPIPA7              16       18451943
## 29789                       SNORD113              14      101422577
## 29810                       SNORD113              14      101443726
## 29811                       SNORD113              14      101445339
## 29812                       SNORD113              14      101446329
## 29825                       SNORD113              14      101460594
## 29826                       SNORD113              14      101464804
w_duplicate = which(gene_anno$external_gene_name %in% names(t2))
ganno2 = gene_anno[w_duplicate,]
dim(ganno2)
## [1] 87  9
table(ganno2$hgnc_symbol == ganno2$external_gene_name)
## 
## FALSE  TRUE 
##    46    41
ganno2 = ganno2[which(ganno2$hgnc_symbol == ganno2$external_gene_name),]
dim(ganno2)
## [1] 41  9
ganno2 = dplyr::distinct(ganno2,external_gene_name,.keep_all = TRUE)
dim(ganno2)
## [1] 39  9
gene_anno = rbind(gene_anno[-w_duplicate,], ganno2)
dim(gene_anno)
## [1] 31930     9
table(gene_anno$gene_biotype)
## 
## 3prime_overlapping_ncrna                antisense                IG_C_gene 
##                       11                     4152                        3 
##                IG_V_gene                  lincRNA                    miRNA 
##                        6                     4807                      966 
##                 misc_RNA                  Mt_rRNA                  Mt_tRNA 
##                      935                        2                       18 
##     processed_transcript           protein_coding               pseudogene 
##                      394                    18296                        3 
##                     rRNA           sense_intronic        sense_overlapping 
##                      204                      648                      172 
##                   snoRNA                    snRNA                TR_C_gene 
##                      201                     1083                        5 
##                TR_J_gene                TR_V_gene 
##                        7                       17
gene_missing = setdiff(rownames(sce),gene_anno$external_gene_name)
gene_missing[1:10]
##  [1] "AC005152.2" "AC006132.1" "AC006445.8" "AC007092.1" "AC007390.5"
##  [6] "AC007464.1" "AC009232.2" "AC009236.1" "AC010127.5" "AC011043.1"
length(gene_missing)
## [1] 181
w2kp = match(gene_anno$external_gene_name,rownames(sce))
table(is.na(w2kp))
## 
## FALSE 
## 31930
gene_anno$external_gene_name[which(is.na(w2kp))]
## character(0)
sce = sce[w2kp,]

dim(sce)
## [1] 31930 14963
rowData(sce) = gene_anno

sce
## class: SingleCellExperiment 
## dim: 31930 14963 
## metadata(0):
## assays(1): counts
## rownames(31930): AC006946.16 AC006946.17 ... ZNF8 ZNF788
## rowData names(9): hgnc_symbol external_gene_name ...
##   percentage_gene_gc_content gene_biotype
## colnames(14963): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
##   PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(7): sample_name part_cell_id ... Habib_TSNE1
##   Habib_TSNE2
## reducedDimNames(0):
## spikeNames(0):
rowData(sce)[1:5,]
## DataFrame with 5 rows and 9 columns
##   hgnc_symbol external_gene_name chromosome_name start_position
##   <character>        <character>     <character>      <integer>
## 1                    AC006946.16              22       17548711
## 2                    AC006946.17              22       17561591
## 3                    AC004019.13              22       18062923
## 4       ABHD3              ABHD3              18       19230858
## 5                    AC004471.10              22       19111822
##   end_position    strand
##      <integer> <integer>
## 1     17551565         1
## 2     17562346         1
## 3     18071958        -1
## 4     19284766        -1
## 5     19115962        -1
##                                                      description
##                                                      <character>
## 1                                                               
## 2                                                               
## 3                                                               
## 4 abhydrolase domain containing 3 [Source:HGNC Symbol;Acc:18718]
## 5                                                               
##   percentage_gene_gc_content   gene_biotype
##                    <numeric>    <character>
## 1                      41.61        lincRNA
## 2                      41.67        lincRNA
## 3                       52.6      antisense
## 4                      42.45 protein_coding
## 5                       58.3        lincRNA

3.2 Identify Low quality cells

3.2.1 barcodeRanks filtering

Please refer to this workflow in bioconductor for reference.

bcrank = barcodeRanks(counts(sce))
# Only show unique points for plotting speed.
uniq = !duplicated(bcrank$rank)

par(mar=c(5,4,2,1), bty="n")
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy", 
    xlab="Rank", ylab="Total UMI count", cex=0.5, cex.lab=1.2)
abline(h=bcrank$inflection, col="darkgreen", lty=2,lwd=2)
abline(h=bcrank$knee, col="dodgerblue", lty=2,lwd=2)

legend("left", legend=c("Inflection", "Knee"), bty="n", 
    col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2,lwd=2)

bcrank$inflection
## hCf_TAATAGCGGTAC 
##              221
bcrank$knee
## PFC2-A2_CACAGCGCTGTC 
##                  580
summary(bcrank$total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   218.0   459.0   716.0   948.9  1167.0  9423.0
table(bcrank$total >= bcrank$knee)
## 
## FALSE  TRUE 
##  5513  9450
table(bcrank$total >= bcrank$inflection)
## 
## FALSE  TRUE 
##     1 14962
set.seed(100)
date()
## [1] "Wed Jan 30 11:52:07 2019"
e_out = emptyDrops(counts(sce))
date()
## [1] "Wed Jan 30 11:52:28 2019"
e_out
## DataFrame with 14963 rows and 5 columns
##                         Total   LogProb    PValue   Limited       FDR
##                     <integer> <numeric> <numeric> <logical> <numeric>
## hHP1_AACACTATCTAC        4484       NaN         1     FALSE         0
## hHP1_CTACGCATCCAT        4919       NaN         1     FALSE         0
## hHP1_TCGGTACTAATA        5163       NaN         1     FALSE         0
## hHP1_CCCGCACGCTAT        5030       NaN         1     FALSE         0
## hHP1_TCATTTTGTCAT        3588       NaN         1     FALSE         0
## ...                       ...       ...       ...       ...       ...
## PFC-CD_TTGCCTGGCGGG       590       NaN         1     FALSE         0
## PFC-CD_CACGCTCCCCTA       269       NaN         1     FALSE         1
## PFC-CD_GCTCTACAACCG       522       NaN         1     FALSE         1
## PFC-CD_CTCCATTCATGC       497       NaN         1     FALSE         1
## PFC-CD_CGTCATTAGCAG       568       NaN         1     FALSE         1
length(unique(e_out$FDR))
## [1] 2
table(e_out$FDR)
## 
##    0    1 
## 9450 5513
tapply(e_out$Total, e_out$FDR, summary)
## $`0`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     581     748    1011    1268    1487    9423 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   218.0   317.0   391.0   402.3   489.0   580.0

From the above analysis, some cells with very small number of UMIs. Here we chose do not remove any cells because it appears all these 14,963 cells were used in the main anlaysis by (Habib et al. 2017). Based on Figure 2 of their paper, there are

14,963 DroNc-seq nuclei profiles (each with >10,000 reads and >200 genes)

3.2.2 Incorporate information of mitochondira/ribosomal genes in QC metrics

We will generate a set of QC features per cell, including the expression of mitochondira/ribosomal genes. We identify ribosomal genes based on annoation from https://www.genenames.org/.

ribo_fn = file.path(work_dir,"ribosome_genes.txt")
ribo    = smart_RT(ribo_fn,sep='\t',header=TRUE)
ribo[1:2,]
##   HGNC.ID Approved.Symbol          Approved.Name   Status Previous.Symbols
## 1   10298           RPL10  ribosomal protein L10 Approved                 
## 2   10299          RPL10A ribosomal protein L10a Approved            NEDD6
##                                  Synonyms Chromosome Accession.Numbers
## 1 NOV, QM, DXS648E, DXS648, FLJ23544, L10       Xq28          AB007170
## 2                            Csa-19, L10A    6p21.31            U12404
##   RefSeq.IDs Gene.Family.Tag Gene.family.description Gene.family.ID
## 1  NM_006013             RPL    L ribosomal proteins            729
## 2  NM_007104             RPL    L ribosomal proteins            729
is_mito = which(rowData(sce)$chromosome_name == "MT")
is_ribo = which(rowData(sce)$external_gene_name %in% ribo$Approved.Symbol)
length(is_mito)
## [1] 33
length(is_ribo)
## [1] 160
sce = calculateQCMetrics(sce, feature_controls=list(Mt=is_mito, Ri=is_ribo))
sort(colnames(colData(sce)))
##  [1] "Habib_cell_type"                               
##  [2] "Habib_clusters"                                
##  [3] "Habib_clusters_name"                           
##  [4] "Habib_TSNE1"                                   
##  [5] "Habib_TSNE2"                                   
##  [6] "is_cell_control"                               
##  [7] "log10_total_counts"                            
##  [8] "log10_total_counts_endogenous"                 
##  [9] "log10_total_counts_feature_control"            
## [10] "log10_total_counts_Mt"                         
## [11] "log10_total_counts_Ri"                         
## [12] "log10_total_features"                          
## [13] "log10_total_features_by_counts"                
## [14] "log10_total_features_by_counts_endogenous"     
## [15] "log10_total_features_by_counts_feature_control"
## [16] "log10_total_features_by_counts_Mt"             
## [17] "log10_total_features_by_counts_Ri"             
## [18] "log10_total_features_endogenous"               
## [19] "log10_total_features_feature_control"          
## [20] "log10_total_features_Mt"                       
## [21] "log10_total_features_Ri"                       
## [22] "part_cell_id"                                  
## [23] "pct_counts_endogenous"                         
## [24] "pct_counts_feature_control"                    
## [25] "pct_counts_in_top_100_features"                
## [26] "pct_counts_in_top_100_features_endogenous"     
## [27] "pct_counts_in_top_100_features_feature_control"
## [28] "pct_counts_in_top_100_features_Ri"             
## [29] "pct_counts_in_top_200_features"                
## [30] "pct_counts_in_top_200_features_endogenous"     
## [31] "pct_counts_in_top_50_features"                 
## [32] "pct_counts_in_top_50_features_endogenous"      
## [33] "pct_counts_in_top_50_features_feature_control" 
## [34] "pct_counts_in_top_50_features_Ri"              
## [35] "pct_counts_in_top_500_features"                
## [36] "pct_counts_in_top_500_features_endogenous"     
## [37] "pct_counts_Mt"                                 
## [38] "pct_counts_Ri"                                 
## [39] "pct_counts_top_100_features"                   
## [40] "pct_counts_top_100_features_endogenous"        
## [41] "pct_counts_top_100_features_feature_control"   
## [42] "pct_counts_top_100_features_Ri"                
## [43] "pct_counts_top_200_features"                   
## [44] "pct_counts_top_200_features_endogenous"        
## [45] "pct_counts_top_50_features"                    
## [46] "pct_counts_top_50_features_endogenous"         
## [47] "pct_counts_top_50_features_feature_control"    
## [48] "pct_counts_top_50_features_Ri"                 
## [49] "pct_counts_top_500_features"                   
## [50] "pct_counts_top_500_features_endogenous"        
## [51] "sample_name"                                   
## [52] "total_counts"                                  
## [53] "total_counts_endogenous"                       
## [54] "total_counts_feature_control"                  
## [55] "total_counts_Mt"                               
## [56] "total_counts_Ri"                               
## [57] "total_features"                                
## [58] "total_features_by_counts"                      
## [59] "total_features_by_counts_endogenous"           
## [60] "total_features_by_counts_feature_control"      
## [61] "total_features_by_counts_Mt"                   
## [62] "total_features_by_counts_Ri"                   
## [63] "total_features_endogenous"                     
## [64] "total_features_feature_control"                
## [65] "total_features_Mt"                             
## [66] "total_features_Ri"
par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
hist(log10(sce$total_counts), xlab="log10(Library sizes)", main="", 
    breaks=20, col="grey80", ylab="Number of cells")
hist(log10(sce$total_features), xlab="log10(# of expressed genes)", 
    main="", breaks=20, col="grey80", ylab="Number of cells")
hist(sce$pct_counts_Ri, xlab="Ribosome prop. (%)",
    ylab="Number of cells", breaks=40, main="", col="grey80")
hist(sce$pct_counts_Mt, xlab="Mitochondrial prop. (%)", 
    ylab="Number of cells", breaks=80, main="", col="grey80")

par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
smoothScatter(log10(sce$total_counts), log10(sce$total_features), 
    xlab="log10(Library sizes)", ylab="log10(# of expressed genes)")
smoothScatter(log10(sce$total_counts), sce$pct_counts_Ri,
    xlab="log10(Library sizes)", ylab="Ribosome prop. (%)")
smoothScatter(log10(sce$total_counts), sce$pct_counts_Mt,
    xlab="log10(Library sizes)", ylab="Mitochondrial prop. (%)")
smoothScatter(sce$pct_counts_Ri,sce$pct_counts_Mt,
    xlab="Ribosome prop. (%)", ylab="Mitochondrial prop. (%)")

From the QC results, we will filter on the metrics by identifying outliers using isOutlier. Note that the cells assigned to cluster 18 by (Habib et al. 2017) will all be excluded.

libsize_drop = isOutlier(sce$total_counts,nmads = 3,type = "lower",log = TRUE)
feature_drop = isOutlier(sce$total_features_by_counts,nmads = 3,
                         type = "lower",log = TRUE)
mito_drop = isOutlier(sce$pct_counts_Mt,nmads = 3,type = "higher")
ribo_drop = isOutlier(sce$pct_counts_Ri,nmads = 3,type = "higher")
keep = !(libsize_drop | feature_drop | mito_drop | ribo_drop)
smart_df(ByLibSize = sum(libsize_drop),ByFeature = sum(feature_drop),
        ByMito = sum(mito_drop),ByRibo = sum(ribo_drop),
        Remaining = sum(keep))
##   ByLibSize ByFeature ByMito ByRibo Remaining
## 1         0         0   1193    735     13181
smart_table(colData(sce)$Habib_clusters,keep)
##     keep
##      FALSE TRUE
##   1     56 3048
##   2      4  293
##   3     31  390
##   4      8  737
##   5     18  874
##   6     51  772
##   7     32 1421
##   8    126 1078
##   9    124  581
##   10   234 1484
##   11    92 1155
##   12   106  578
##   13   108  281
##   14    40  161
##   15   279  120
##   16   171   83
##   17   126   74
##   18   132    0
##   19    44   51
smart_table(colData(sce)$Habib_clusters_name,keep)
##         keep
##          FALSE TRUE
##   ASC1     126 1078
##   ASC2     124  581
##   END      279  120
##   exCA1     31  390
##   exCA3      8  737
##   exDG      32 1421
##   exPFC1    56 3048
##   exPFC2     4  293
##   GABA1     18  874
##   GABA2     51  772
##   MG       108  281
##   NSC       40  161
##   ODC1     234 1484
##   ODC2      92 1155
##   OPC      106  578
##   <NA>     473  208
sce = sce[,keep]
dim(sce)
## [1] 31930 13181

3.3 Summarize gene level information

rowData(sce)[1:2,]
## DataFrame with 2 rows and 20 columns
##   hgnc_symbol external_gene_name chromosome_name start_position
##   <character>        <character>     <character>      <integer>
## 1                    AC006946.16              22       17548711
## 2                    AC006946.17              22       17561591
##   end_position    strand description percentage_gene_gc_content
##      <integer> <integer> <character>                  <numeric>
## 1     17551565         1                                  41.61
## 2     17562346         1                                  41.67
##   gene_biotype is_feature_control is_feature_control_Mt
##    <character>          <logical>             <logical>
## 1      lincRNA              FALSE                 FALSE
## 2      lincRNA              FALSE                 FALSE
##   is_feature_control_Ri         mean_counts   log10_mean_counts
##               <logical>           <numeric>           <numeric>
## 1                 FALSE 0.00548018445498897  0.0023735161394708
## 2                 FALSE 0.00360890195816347 0.00156450482886486
##   n_cells_by_counts pct_dropout_by_counts total_counts log10_total_counts
##           <integer>             <numeric>    <integer>          <numeric>
## 1                79      99.4720310098242           82   1.91907809237607
## 2                52      99.6524761077324           54   1.74036268949424
##   n_cells_counts pct_dropout_counts
##        <integer>          <numeric>
## 1             79   99.4720310098242
## 2             52   99.6524761077324
summary(rowData(sce)$mean_counts)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   0.00007   0.00033   0.00287   0.02972   0.02484 107.63510
summary(rowData(sce)$mean_counts[rowData(sce)$mean_counts>0])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   0.00007   0.00033   0.00287   0.02972   0.02484 107.63510
summary(rowData(sce)$n_cells_counts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     5.0    40.5   307.8   341.0 13416.0
par(mfrow=c(1,3), mar=c(5,4,1,1))
hist(log10(rowData(sce)$mean_counts+1e-6), col="grey80",  main="", 
    breaks=40, xlab="log10(ave # of UMI + 1e-6)")
hist(log10(rowData(sce)$n_cells_counts+1), col="grey80", main="", 
    breaks=40, xlab="log10(# of expressed cells + 1)")
plot(log10(rowData(sce)$mean_counts+1e-6), pch=16, col=rgb(0,0,0,0.4), 
    log10(rowData(sce)$n_cells_counts + 1), 
    xlab="log10(ave # of UMI + 1e-6)", 
    ylab="log10(# of expressed cells + 1)")

tb1 = table(rowData(sce)$n_cells_counts)
tb1[1:11]
## 
##    1    2    3    4    5    6    7    8    9   10   11 
## 3302 1958 1280 1029  843  667  552  525  473  383  369

We remove those genes that are lowly expressed. (Habib et al. 2017) mentioned in section “Gene detection and quality controls” of Online Methods

Nuclei with less than 200 detected genes and less than 10,000 usable reads were filtered out.

and

A gene is considered detected in a cell if it has at least two unique UMIs (transcripts) associated with it. For each analysis, genes were removed that were detected in less than 10 nuclei.

Therefore it seems we should remove all the nuclei. having less than 200 genes with two or more UMI counts. However, this would remove the majority of the nuclei. Therefore, we conlcude that they meant to remove the nuclei. having less than 200 genes with one or more UMI counts. To filter genes, we follow their threshold to remove genes with two or more UMIs in less than 10 nuclei.

Note that the variable strand need to be renamed, otherwise there is an error message that such a variabel name cannot be used.

names(rowData(sce))[names(rowData(sce)) == "strand"] = "strand_n"

n_genes = colSums(counts(sce) >= 2)
summary(n_genes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    10.0    43.0    71.0   112.5   133.0  1555.0
table(n_genes >= 100)
## 
## FALSE  TRUE 
##  8440  4741
table(n_genes >= 200)
## 
## FALSE  TRUE 
## 11339  1842
n_genes = colSums(counts(sce) >= 1)
summary(n_genes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   198.0   351.0   540.0   670.5   845.0  4233.0
table(n_genes >= 100)
## 
##  TRUE 
## 13181
table(n_genes >= 200)
## 
## FALSE  TRUE 
##     2 13179
n_cells = rowSums(counts(sce) >= 2)
summary(n_cells)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     1.00    46.45    22.00 11725.00
table(n_cells >= 10)
## 
## FALSE  TRUE 
## 21323 10607
sce = sce[which(n_cells >= 10),]
dim(sce)
## [1] 10607 13181

Next we check those highly expressed genes

par(mar=c(5,4,1,1))
od1 = order(rowData(sce)$mean_counts, decreasing = TRUE)
barplot(rowData(sce)$mean_counts[od1[20:1]], las=1, 
        names.arg=rowData(sce)$hgnc_symbol[od1[20:1]], 
        horiz=TRUE, cex.names=0.8, cex.axis=0.8, 
        xlab="ave # of UMI")

3.4 Normalization

A simple solution for normalization and stablizing expression varaince across genes is to tranform the count data by log(count/size.factor + 1). One may calcualte size.factor per cell as the total number of UMIs, and this assumes the total expression are the same across all the cells. However, the total expression of each cell may vary with respect to cell type and/or cell size, and the computeSumFactors function in R package scran provides a more sophisicated way to calcualte size.factor to allow such variaation across cells (Lun, Bach, and Marioni 2016). computeSumFactors can use initial clustering of cells to normalize expression within and beetween clusters. Within a cluster, it estimates the size factor for many groups of cells so that there are more groups than cells, and then it can calcualte the size factor per cell using a lienar deconvolution system. We remove all the cells with negative or very small size factors (< 0.01).

As shown in the following plot, the final size factor estimation is indeed highly correlated with the naive definition by total count.

Finally, the command normalize(sce) adds the normalized expression into the variable sce, which can be accessed by logcounts(sce) = log2(gene_cell_count / size_factor + 1).

date()
## [1] "Wed Jan 30 11:53:13 2019"
clusters = quickCluster(sce, min.mean=0.1, method="igraph")
table(clusters)
## clusters
##    1    2    3    4    5    6 
##  503 1590 1591 2864 2062 4571
date()
## [1] "Wed Jan 30 11:53:57 2019"
sce = computeSumFactors(sce, cluster=clusters, min.mean=0.1)
date()
## [1] "Wed Jan 30 11:55:52 2019"
summary(sizeFactors(sce))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0927  0.4487  0.7515  1.0000  1.2610  9.7585
sort(sizeFactors(sce))[1:30]
##  [1] 0.09269791 0.10089499 0.10199474 0.11052233 0.11186730 0.11451162
##  [7] 0.11579893 0.11606944 0.11686686 0.11799359 0.12004139 0.12039394
## [13] 0.12144386 0.12297642 0.12451394 0.12485572 0.12649517 0.12703992
## [19] 0.13203308 0.13261365 0.13403081 0.13450431 0.13482033 0.13840999
## [25] 0.13862154 0.13927866 0.14119765 0.14164402 0.14274874 0.14293551
dim(sce)
## [1] 10607 13181
sce = sce[,which(sizeFactors(sce) > 0.01)]
dim(sce)
## [1] 10607 13181
par(mfrow=c(1,2), mar=c(5,4,2,1), bty="n")
smoothScatter(sce$total_counts, sizeFactors(sce), log="xy", 
                xlab="total counts", ylab="size factors")
plot(sce$total_counts, sizeFactors(sce), log="xy", 
    xlab="total counts", ylab="size factors", 
    cex=0.3, pch=20, col=rgb(0.1,0.2,0.7,0.3))
abline(h=0.05)

dim(sce)
## [1] 10607 13181
sce = sce[,which(sizeFactors(sce) > 0.05)]
dim(sce)
## [1] 10607 13181
sce = normalize(sce) 

3.5 Dimension Reduction

For dimension reduction, such as calculating PCA or performing TSNE, we should start by identifying a subset of genes with high level of biological signal relative to background (technical) noise. These genes are referred to as HVGs (highly variable genes). The decomposeVar function from R/cran is designed for this task.

new_trend = makeTechTrend(x=sce)
fit = trendVar(sce, use.spikes=FALSE, loess.args=list(span=0.05))

par(mfrow=c(1,1), mar=c(5,4,2,1), bty="n")
plot(fit$mean, fit$var, pch=20, col=rgb(0.1,0.2,0.7,0.6), 
    xlab="log(mean)", ylab="var")
curve(fit$trend(x), col="orange", lwd=2, add=TRUE)
curve(new_trend(x), col="red", lwd=2, add=TRUE)
legend("top", legend=c("Poisson noise", "observed trend"), 
        lty=1, lwd=2, col=c("red", "orange"), bty="n")

fit$trend = new_trend

dec     = decomposeVar(fit=fit) 
top_dec = dec[order(dec$bio, decreasing=TRUE),]
plotExpression(sce, features=rownames(top_dec)[1:10])

When performing PCA, we can use all the genes or just those genes with high signal-to-noise ratio (HGVs). TSNE analysis is usually based on the top PCs rather than the original gene expression data. We first perform PCA using all the genes and the function denoisePCA can automatically select the PCs based on modeling of technical noise.

date()
## [1] "Wed Jan 30 11:56:29 2019"
sce = denoisePCA(sce,technical=new_trend,approx=TRUE) # Using the Poisson trend fit
date()
## [1] "Wed Jan 30 11:58:11 2019"
dim(reducedDim(sce,"PCA"))
## [1] 13181    94
par(mfrow=c(1,1))
plot(log10(attr(reducedDim(sce), "percentVar")), xlab="PC",
    ylab="log10(Prop of variance explained)", pch=20, cex=0.6, 
    col=rgb(0.8, 0.2, 0.2, 0.5))
abline(v=ncol(reducedDim(sce,"PCA")), lty=2, col="red")

df_redDim = smart_df(colData(sce)[,c("sample_name","part_cell_id",
    paste0("Habib_TSNE",1:2),"log10_total_features","Habib_clusters",
    "Habib_clusters_name","Habib_cell_type")],
    reducedDim(sce, "PCA")[,1:2])
rownames(df_redDim) = NULL

ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
              COL = "part_cell_id",TYPE = "cat")

ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
              COL = "log10_total_features",TYPE = "cont")

ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
              COL = "Habib_clusters",TYPE = "cat")

ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
              COL = "Habib_clusters_name",TYPE = "cat")

ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
              COL = "Habib_cell_type",TYPE = "cat")

date()
## [1] "Wed Jan 30 11:58:16 2019"
sce = runTSNE(sce,use_dimred="PCA",perplexity=30,rand_seed=100)
date()
## [1] "Wed Jan 30 12:00:55 2019"
tmp_df = smart_df(reducedDim(sce,"TSNE"))
rownames(tmp_df) = NULL
names(tmp_df) = paste0("my_TSNE",1:2)
df_redDim = smart_df(df_redDim,tmp_df); rm(tmp_df)
df_redDim[1:5,]
##         sample_name part_cell_id Habib_TSNE1 Habib_TSNE2
## 1 hHP1_AACACTATCTAC         hHP1   -8.436537    7.721106
## 2 hHP1_CTACGCATCCAT         hHP1   -6.606278    8.099538
## 3 hHP1_TCGGTACTAATA         hHP1   -6.724967    8.011943
## 4 hHP1_CCCGCACGCTAT         hHP1   -6.286364    7.892359
## 5 hHP1_TCATTTTGTCAT         hHP1  -10.455725    2.374719
##   log10_total_features Habib_clusters Habib_clusters_name Habib_cell_type
## 1             3.393048              4               exCA3            exCA
## 2             3.453624              3               exCA1            exCA
## 3             3.458638              3               exCA1            exCA
## 4             3.419129              3               exCA1            exCA
## 5             3.345178              3               exCA1            exCA
##        PC1       PC2 my_TSNE1  my_TSNE2
## 1 3.326051 -1.240640 20.77887 11.296457
## 2 3.747268 -1.075909 20.85935  8.749623
## 3 4.543743 -1.682305 22.32118  6.832821
## 4 4.255275 -1.072497 22.64867  7.073299
## 5 3.136629 -1.083864 20.22245  4.447358
ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
              COL = "part_cell_id",TYPE = "cat")

ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
              COL = "log10_total_features",TYPE = "cont")

ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
              COL = "Habib_clusters",TYPE = "cat")

ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
              COL = "Habib_clusters_name",TYPE = "cat")

ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
              COL = "Habib_cell_type",TYPE = "cat")

saveRDS(list(sce = sce,dec = dec,df_redDim = df_redDim),
        file.path(dronc_dir,"post_redDim_all_genes.rds"))

We select around top 1,000 genes (based on FDR and biological residual thresholds) for the PCA and use the top 50 PCs for TSNE projection. When calculating PCs, we use log-transformed normalized gene expression data: log2(norm_express+1).

library(svd)
library(Rtsne)

summary(dec$bio)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.022095  0.000137  0.003208  0.009682  0.008864  5.261563
dec1 = dec
dec1$bio[which(dec$bio < 1e-5)] = 1e-5
dec1$FDR[which(dec$FDR < 1e-100)] = 1e-100

par(mfrow=c(1,2))
hist(log10(dec1$bio),breaks=100,main="")
hist(log10(dec1$FDR),breaks=100,main="")

summary(dec$FDR[dec$bio > 0.01])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 1.356e-05 0.000e+00 2.618e-02
summary(dec$FDR[dec$bio > 0.03])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 5.310e-10 0.000e+00 3.248e-07
table(dec$FDR < 1e-10,dec$bio > 0.03)
##        
##         FALSE TRUE
##   FALSE  5471    6
##   TRUE   4448  682
table(dec$FDR < 1e-10,dec$bio > 0.01)
##        
##         FALSE TRUE
##   FALSE  5347  130
##   TRUE   2892 2238
table(dec$FDR < 1e-10,dec$bio > 0.02)
##        
##         FALSE TRUE
##   FALSE  5455   22
##   TRUE   4044 1086
w2kp = which(dec$FDR < 1e-10 & dec$bio > 0.02)
sce_hvg = sce[w2kp,]
sce_hvg
## class: SingleCellExperiment 
## dim: 1086 13181 
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(1086): AATK AC004158.3 ... ZSWIM6 ZNF704
## rowData names(20): hgnc_symbol external_gene_name ...
##   n_cells_counts pct_dropout_counts
## colnames(13181): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
##   PFC-CD_CACGCTCCCCTA PFC-CD_CGTCATTAGCAG
## colData names(66): sample_name part_cell_id ...
##   pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
edat = t(as.matrix(logcounts(sce_hvg)))
edat = scale(edat)
dim(edat)
## [1] 13181  1086
edat[1:2,1:3]
##                         AATK AC004158.3     ABLIM1
## hHP1_AACACTATCTAC -0.3259988 -0.1787744 -0.3700985
## hHP1_CTACGCATCCAT  0.1885733 -0.1787744 -0.3700985
date()
## [1] "Wed Jan 30 12:01:25 2019"
ppk = propack.svd(edat,neig=50)
date()
## [1] "Wed Jan 30 12:01:36 2019"
pca = t(ppk$d*t(ppk$u)) # calculates pc scores aka principal components

tmp_df = smart_df(pca[,1:2])
names(tmp_df) = paste0("HVG_PC",seq(ncol(tmp_df)))

df_hvg = smart_df(colData(sce)[,c("sample_name","part_cell_id",
    paste0("Habib_TSNE",1:2),"log10_total_features","Habib_clusters",
    "Habib_clusters_name","Habib_cell_type")],tmp_df)
rownames(df_hvg) = NULL

set.seed(100)
date()
## [1] "Wed Jan 30 12:01:36 2019"
tsne = Rtsne(pca, pca = FALSE)
date()
## [1] "Wed Jan 30 12:04:01 2019"
tmp_df = smart_df(tsne$Y)
names(tmp_df) = paste0("HVG_TSNE",seq(ncol(tmp_df)))
df_hvg = smart_df(df_hvg,tmp_df)

ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
              COL = "part_cell_id",TYPE = "cat")

ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
              COL = "log10_total_features",TYPE = "cont")

ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
              COL = "Habib_clusters",TYPE = "cat")

ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
              COL = "Habib_clusters_name",TYPE = "cat")

ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
              COL = "Habib_cell_type",TYPE = "cat")

reducedDims(sce_hvg) = SimpleList(PCA=pca, TSNE=tsne$Y)
sce_hvg
## class: SingleCellExperiment 
## dim: 1086 13181 
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(1086): AATK AC004158.3 ... ZSWIM6 ZNF704
## rowData names(20): hgnc_symbol external_gene_name ...
##   n_cells_counts pct_dropout_counts
## colnames(13181): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
##   PFC-CD_CACGCTCCCCTA PFC-CD_CGTCATTAGCAG
## colData names(66): sample_name part_cell_id ...
##   pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
saveRDS(sce_hvg,file.path(dronc_dir,"post_HVG.rds"))

3.6 Clustering

3.6.1 Kmeans

Next we cluster the cells using a simple kmeans method on the top 50 PCs. We set the number of clusters to be 5 thru 15, to include the 12 cell types reported by (Habib et al. 2017).

all_num_clust = c(5:15)
df_hvg = df_hvg[,!grepl("^KM_",names(df_hvg))]

for(num_clust in all_num_clust){
    cat(paste0("KM with ",num_clust," clusters.\n"))
    kmeans_out = kmeans(reducedDim(sce_hvg, "PCA"), centers = num_clust, 
                        iter.max = 1e8, nstart = 2500,
                        algorithm = "MacQueen")
    km_label = paste0("KM_",num_clust)
    df_hvg[[km_label]] = as.factor(kmeans_out$cluster)

    print(smart_table(df_hvg$Habib_cell_type,kmeans_out$cluster))
    print(smart_table(df_hvg$Habib_clusters_name,kmeans_out$cluster))
    print(ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2", 
                        COL = km_label,TYPE = "cat"))
}
## KM with 5 clusters.
##        
##            1    2    3    4    5
##   ASC     58    0   25 1573    3
##   END    118    0    1    0    1
##   exCA  1113    1    7    6    0
##   exDG  1418    0    3    0    0
##   exPFC 3218    4   61   52    6
##   GABA  1636    1    3    4    2
##   MG      36  211   23    9    2
##   NSC      3  153    0    5    0
##   ODC     49    2 2576    6    6
##   OPC     29    0    7    2  540
##   <NA>   109   71   17   11    0
##         
##             1    2    3    4    5
##   ASC1     39    0   14 1023    2
##   ASC2     19    0   11  550    1
##   END     118    0    1    0    1
##   exCA1   380    1    4    5    0
##   exCA3   733    0    3    1    0
##   exDG   1418    0    3    0    0
##   exPFC1 2967    2   36   37    6
##   exPFC2  251    2   25   15    0
##   GABA1   868    1    3    1    1
##   GABA2   768    0    0    3    1
##   MG       36  211   23    9    2
##   NSC       3  153    0    5    0
##   ODC1     28    2 1445    5    4
##   ODC2     21    0 1131    1    2
##   OPC      29    0    7    2  540
##   <NA>    109   71   17   11    0

## KM with 6 clusters.
##        
##            1    2    3    4    5    6
##   ASC      0 1573   25   58    0    3
##   END      0    0    1  118    0    1
##   exCA     0    7    7 1112    1    0
##   exDG     0    0    3 1418    0    0
##   exPFC    0   52   61 3220    2    6
##   GABA     0    4    3 1636    1    2
##   MG       0    7   13   30  230    1
##   NSC    144    9    0    8    0    0
##   ODC      0    6 2575   49    3    6
##   OPC      0    2    7   29    0  540
##   <NA>     1   12   17  112   66    0
##         
##             1    2    3    4    5    6
##   ASC1      0 1023   14   39    0    2
##   ASC2      0  550   11   19    0    1
##   END       0    0    1  118    0    1
##   exCA1     0    6    4  379    1    0
##   exCA3     0    1    3  733    0    0
##   exDG      0    0    3 1418    0    0
##   exPFC1    0   37   36 2969    0    6
##   exPFC2    0   15   25  251    2    0
##   GABA1     0    1    3  868    1    1
##   GABA2     0    3    0  768    0    1
##   MG        0    7   13   30  230    1
##   NSC     144    9    0    8    0    0
##   ODC1      0    5 1444   28    3    4
##   ODC2      0    1 1131   21    0    2
##   OPC       0    2    7   29    0  540
##   <NA>      1   12   17  112   66    0

## KM with 7 clusters.
##        
##            1    2    3    4    5    6    7
##   ASC     51   25    0    7    3 1573    0
##   END    115    1    0    3    1    0    0
##   exCA  1103    7    1    9    0    7    0
##   exDG  1417    3    0    1    0    0    0
##   exPFC 3167   61    2   53    6   52    0
##   GABA   295    3    1 1344    0    3    0
##   MG      30   13  230    0    1    7    0
##   NSC      7    0    0    0    0   10  144
##   ODC     42 2574    3    8    6    6    0
##   OPC     25    6    0    4  541    2    0
##   <NA>    60   17   66   52    0   12    1
##         
##             1    2    3    4    5    6    7
##   ASC1     34   14    0    5    2 1023    0
##   ASC2     17   11    0    2    1  550    0
##   END     115    1    0    3    1    0    0
##   exCA1   375    4    1    4    0    6    0
##   exCA3   728    3    0    5    0    1    0
##   exDG   1417    3    0    1    0    0    0
##   exPFC1 2919   36    0   50    6   37    0
##   exPFC2  248   25    2    3    0   15    0
##   GABA1   278    3    1  591    0    1    0
##   GABA2    17    0    0  753    0    2    0
##   MG       30   13  230    0    1    7    0
##   NSC       7    0    0    0    0   10  144
##   ODC1     25 1444    3    3    4    5    0
##   ODC2     17 1130    0    5    2    1    0
##   OPC      25    6    0    4  541    2    0
##   <NA>     60   17   66   52    0   12    1

## KM with 8 clusters.
##        
##            1    2    3    4    5    6    7    8
##   ASC   1572   25   15    3    0   42    2    0
##   END      0    1  113    1    0    2    3    0
##   exCA     7    7  828    0    0  274   10    1
##   exDG     0    3 1402    0    0   16    0    0
##   exPFC   52   59   56    5    0 3136   31    2
##   GABA     3    3   15    0    0  452 1172    1
##   MG       7   13   11    1    0   20    0  229
##   NSC      8    0    9    0  144    0    0    0
##   ODC      5 2575   14    6    0   28    8    3
##   OPC      2    7    4  539    0   23    3    0
##   <NA>    12   17   43    0    1   15   54   66
##         
##             1    2    3    4    5    6    7    8
##   ASC1   1024   14    9    2    0   27    2    0
##   ASC2    548   11    6    1    0   15    0    0
##   END       0    1  113    1    0    2    3    0
##   exCA1     6    4  299    0    0   74    6    1
##   exCA3     1    3  529    0    0  200    4    0
##   exDG      0    3 1402    0    0   16    0    0
##   exPFC1   37   35   53    5    0 2890   28    0
##   exPFC2   15   24    3    0    0  246    3    2
##   GABA1     1    3   12    0    0  425  432    1
##   GABA2     2    0    3    0    0   27  740    0
##   MG        7   13   11    1    0   20    0  229
##   NSC       8    0    9    0  144    0    0    0
##   ODC1      5 1445    7    4    0   18    2    3
##   ODC2      0 1130    7    2    0   10    6    0
##   OPC       2    7    4  539    0   23    3    0
##   <NA>     12   17   43    0    1   15   54   66

## KM with 9 clusters.
##        
##            1    2    3    4    5    6    7    8    9
##   ASC      4   15 1570   23   42    2    3    0    0
##   END      0  113    0    1    2    3    1    0    0
##   exCA     0  825    6    9  277    9    0    1    0
##   exDG     0 1402    0    3   16    0    0    0    0
##   exPFC    6   56   48   59 3136   29    5    2    0
##   GABA     0   15    3    3  452 1172    0    1    0
##   MG       1   11    7   13   20    0    1  228    0
##   NSC      0    9    8    0    0    0    0    0  144
##   ODC    902   11    5 1680   25    7    6    3    0
##   OPC      5    4    2    6   23    3  535    0    0
##   <NA>     0   41   12   18   16   54    0   66    1
##         
##             1    2    3    4    5    6    7    8    9
##   ASC1      2    9 1023   13   27    2    2    0    0
##   ASC2      2    6  547   10   15    0    1    0    0
##   END       0  113    0    1    2    3    1    0    0
##   exCA1     0  298    5    6   75    5    0    1    0
##   exCA3     0  527    1    3  202    4    0    0    0
##   exDG      0 1402    0    3   16    0    0    0    0
##   exPFC1    3   54   35   35 2890   26    5    0    0
##   exPFC2    3    2   13   24  246    3    0    2    0
##   GABA1     0   12    1    3  425  432    0    1    0
##   GABA2     0    3    2    0   27  740    0    0    0
##   MG        1   11    7   13   20    0    1  228    0
##   NSC       0    9    8    0    0    0    0    0  144
##   ODC1    708    5    5  741   16    2    4    3    0
##   ODC2    194    6    0  939    9    5    2    0    0
##   OPC       5    4    2    6   23    3  535    0    0
##   <NA>      0   41   12   18   16   54    0   66    1

## KM with 10 clusters.
##        
##            1    2    3    4    5    6    7    8    9   10
##   ASC     42    2    6 1570   15    0    3    0    0   21
##   END      2    3    1    0  113    0    1    0    0    0
##   exCA   282   11    0    6  819    0    0    0    0    9
##   exDG    16    0    0    0 1402    0    0    0    0    3
##   exPFC 3133   32    6   47   56    0    5    2    0   60
##   GABA   452 1173    0    3   14    0    0    0    1    3
##   MG      20    0    0   11   11    0    2  223    0   14
##   NSC      0    0    0    8    9  144    0    0    0    0
##   ODC     25    6  946    6   11    0    6    2    0 1637
##   OPC     23    3    6    2    4    0  535    0    0    5
##   <NA>    16   53    2   13   39    0    0    2   68   15
##         
##             1    2    3    4    5    6    7    8    9   10
##   ASC1     27    2    3 1023    9    0    2    0    0   12
##   ASC2     15    0    3  547    6    0    1    0    0    9
##   END       2    3    1    0  113    0    1    0    0    0
##   exCA1    75    7    0    5  297    0    0    0    0    6
##   exCA3   207    4    0    1  522    0    0    0    0    3
##   exDG     16    0    0    0 1402    0    0    0    0    3
##   exPFC1 2887   29    3   34   54    0    5    0    0   36
##   exPFC2  246    3    3   13    2    0    0    2    0   24
##   GABA1   425  433    0    1   11    0    0    0    1    3
##   GABA2    27  740    0    2    3    0    0    0    0    0
##   MG       20    0    0   11   11    0    2  223    0   14
##   NSC       0    0    0    8    9  144    0    0    0    0
##   ODC1     16    2  725    5    5    0    4    2    0  725
##   ODC2      9    4  221    1    6    0    2    0    0  912
##   OPC      23    3    6    2    4    0  535    0    0    5
##   <NA>     16   53    2   13   39    0    0    2   68   15

## KM with 11 clusters.
##        
##            1    2    3    4    5    6    7    8    9   10   11
##   ASC      0 1067   20    1    0  518    4   12   36    0    1
##   END      0    1    1    3    0    0    0  112    2    0    1
##   exCA     0    9    8   11    0    0    0  814  285    0    0
##   exDG     0    0    3    0    0    0    0 1402   16    0    0
##   exPFC    0   46   54   32    2   10    6   56 3130    0    5
##   GABA     0    4    3 1172    0    0    0   14  452    1    0
##   MG       0    9   14    0  222    2    1   11   20    0    2
##   NSC    144    7    0    0    0    2    0    8    0    0    0
##   ODC      0    7 1683    6    2    0  899   11   25    0    6
##   OPC      0    3    6    3    0    1    5    4   23    0  533
##   <NA>     0   10   17   53    2    2    0   39   16   68    1
##         
##             1    2    3    4    5    6    7    8    9   10   11
##   ASC1      0  842   11    1    0  191    2    6   25    0    0
##   ASC2      0  225    9    0    0  327    2    6   11    0    1
##   END       0    1    1    3    0    0    0  112    2    0    1
##   exCA1     0    7    5    7    0    0    0  296   75    0    0
##   exCA3     0    2    3    4    0    0    0  518  210    0    0
##   exDG      0    0    3    0    0    0    0 1402   16    0    0
##   exPFC1    0   36   31   29    0    6    3   54 2884    0    5
##   exPFC2    0   10   23    3    2    4    3    2  246    0    0
##   GABA1     0    1    3  433    0    0    0   11  425    1    0
##   GABA2     0    3    0  739    0    0    0    3   27    0    0
##   MG        0    9   14    0  222    2    1   11   20    0    2
##   NSC     144    7    0    0    0    2    0    8    0    0    0
##   ODC1      0    6  744    2    2    0  705    5   16    0    4
##   ODC2      0    1  939    4    0    0  194    6    9    0    2
##   OPC       0    3    6    3    0    1    5    4   23    0  533
##   <NA>      0   10   17   53    2    2    0   39   16   68    1

## KM with 12 clusters.
##        
##            1    2    3    4    5    6    7    8    9   10   11   12
##   ASC      2 1066    0    1    4   11   20    0    0  518    3   34
##   END      3    1    0    1    0  105    1    0    0    0    7    2
##   exCA     9    8    0    0    0  202    8    0    0    0  748  152
##   exDG     0    0    0    0    0 1379    3    0    0    0   23   16
##   exPFC   20   46    2    5    6   38   54    0    0   10   53 3107
##   GABA  1154    4    0    0    0    4    3    0    1    0   37  443
##   MG       0    9  223    1    1    9   14    0    0    2    4   18
##   NSC      0    7    0    0    0    6    0  145    0    2    1    0
##   ODC      5    7    2    6  899   10 1683    0    0    0    1   26
##   OPC      3    3    0  533    5    5    6    0    0    1    2   20
##   <NA>    46   10    2    1    0   40   17    0   68    2    5   17
##         
##             1    2    3    4    5    6    7    8    9   10   11   12
##   ASC1      1  841    0    0    2    6   11    0    0  191    2   24
##   ASC2      1  225    0    1    2    5    9    0    0  327    1   10
##   END       3    1    0    1    0  105    1    0    0    0    7    2
##   exCA1     5    7    0    0    0  179    5    0    0    0  118   76
##   exCA3     4    1    0    0    0   23    3    0    0    0  630   76
##   exDG      0    0    0    0    0 1379    3    0    0    0   23   16
##   exPFC1   17   36    0    5    3   36   31    0    0    6   49 2865
##   exPFC2    3   10    2    0    3    2   23    0    0    4    4  242
##   GABA1   414    1    0    0    0    3    3    0    1    0   33  419
##   GABA2   740    3    0    0    0    1    0    0    0    0    4   24
##   MG        0    9  223    1    1    9   14    0    0    2    4   18
##   NSC       0    7    0    0    0    6    0  145    0    2    1    0
##   ODC1      1    6    2    4  705    5  744    0    0    0    0   17
##   ODC2      4    1    0    2  194    5  939    0    0    0    1    9
##   OPC       3    3    0  533    5    5    6    0    0    1    2   20
##   <NA>     46   10    2    1    0   40   17    0   68    2    5   17

## KM with 13 clusters.
##        
##            1    2    3    4    5    6    7    8    9   10   11   12   13
##   ASC      4    3    0 1073    0    0   35    5   11    1   16  509    2
##   END      7    0    0    1    0    0    2    0  105    1    1    0    3
##   exCA   748    0    0    7    0    0  152    0  200    0   11    0    9
##   exDG    23    0    0    0    0    0   15    0 1379    0    4    0    0
##   exPFC   53    6    0   41    2    0 3108   11   37    5   49   10   19
##   GABA    37    0    0    3    0    1  438    0    4    0    5    0 1158
##   MG       4    1    0   10  221    0   16    1   10    1   16    1    0
##   NSC      1    0  144    6    0    0    0    0    7    0    0    3    0
##   ODC      1  688    0    5    1    0   24  719    8    6 1182    0    5
##   OPC      2    5    0    2    0    0   20    0    5  534    6    1    3
##   <NA>     5    0    0   10    2   68   17    3   40    0   14    3   46
##         
##             1    2    3    4    5    6    7    8    9   10   11   12   13
##   ASC1      2    1    0  855    0    0   25    4    6    0    8  176    1
##   ASC2      2    2    0  218    0    0   10    1    5    1    8  333    1
##   END       7    0    0    1    0    0    2    0  105    1    1    0    3
##   exCA1   118    0    0    6    0    0   76    0  177    0    8    0    5
##   exCA3   630    0    0    1    0    0   76    0   23    0    3    0    4
##   exDG     23    0    0    0    0    0   15    0 1379    0    4    0    0
##   exPFC1   49    3    0   31    0    0 2866    1   35    5   36    6   16
##   exPFC2    4    3    0   10    2    0  242   10    2    0   13    4    3
##   GABA1    33    0    0    1    0    1  414    0    3    0    5    0  417
##   GABA2     4    0    0    2    0    0   24    0    1    0    0    0  741
##   MG        4    1    0   10  221    0   16    1   10    1   16    1    0
##   NSC       1    0  144    6    0    0    0    0    7    0    0    3    0
##   ODC1      0  623    0    4    1    0   17  172    3    4  659    0    1
##   ODC2      1   65    0    1    0    0    7  547    5    2  523    0    4
##   OPC       2    5    0    2    0    0   20    0    5  534    6    1    3
##   <NA>      5    0    0   10    2   68   17    3   40    0   14    3   46

## KM with 14 clusters.
##        
##            1    2    3    4    5    6    7    8    9   10   11   12   13
##   ASC      2  770    5    0  453   10   34    0    0  367    0    4   13
##   END      7    1    0    0    0  105    2    1    0    0    0    0    1
##   exCA   748    7    0    0    1  200  152    0    0    0    0    0   11
##   exDG    23    0    0    0    0 1379   15    0    0    0    0    0    4
##   exPFC   53   43   11    0    7   37 3107    5    2    7    0    6   44
##   GABA    37    6    0    0    0    4  440    0    0    0    1    0    5
##   MG       4    9    1    0    2   10   16    2  219    1    0    1   16
##   NSC      1    3    0  145    1    6    0    0    0    5    0    0    0
##   ODC      1    6  718    0    0    8   25    5    1    0    0  689 1181
##   OPC      2    2    0    0    1    5   20  533    0    1    0    5    6
##   <NA>     5   11    3    0    1   39   17    0    2    2   68    0   14
##        
##           14
##   ASC      1
##   END      3
##   exCA     8
##   exDG     0
##   exPFC   19
##   GABA  1153
##   MG       0
##   NSC      0
##   ODC      5
##   OPC      3
##   <NA>    46
##         
##             1    2    3    4    5    6    7    8    9   10   11   12   13
##   ASC1      1  578    4    0  426    5   24    0    0   30    0    1    8
##   ASC2      1  192    1    0   27    5   10    0    0  337    0    3    5
##   END       7    1    0    0    0  105    2    1    0    0    0    0    1
##   exCA1   118    6    0    0    1  177   76    0    0    0    0    0    8
##   exCA3   630    1    0    0    0   23   76    0    0    0    0    0    3
##   exDG     23    0    0    0    0 1379   15    0    0    0    0    0    4
##   exPFC1   49   32    1    0    5   35 2865    5    0    4    0    3   33
##   exPFC2    4   11   10    0    2    2  242    0    2    3    0    3   11
##   GABA1    33    3    0    0    0    3  416    0    0    0    1    0    5
##   GABA2     4    3    0    0    0    1   24    0    0    0    0    0    0
##   MG        4    9    1    0    2   10   16    2  219    1    0    1   16
##   NSC       1    3    0  145    1    6    0    0    0    5    0    0    0
##   ODC1      0    5  169    0    0    3   17    3    1    0    0  625  660
##   ODC2      1    1  549    0    0    5    8    2    0    0    0   64  521
##   OPC       2    2    0    0    1    5   20  533    0    1    0    5    6
##   <NA>      5   11    3    0    1   39   17    0    2    2   68    0   14
##         
##            14
##   ASC1      1
##   ASC2      0
##   END       3
##   exCA1     4
##   exCA3     4
##   exDG      0
##   exPFC1   16
##   exPFC2    3
##   GABA1   413
##   GABA2   740
##   MG        0
##   NSC       0
##   ODC1      1
##   ODC2      4
##   OPC       3
##   <NA>     46

## KM with 15 clusters.
##        
##            1    2    3    4    5    6    7    8    9   10   11   12   13
##   ASC    455   33    0    3    2   10    0    6    0    0  777   13  359
##   END      0    2    0    0    8  104    0    0    0    0    1    1    0
##   exCA     1  147    0    0  758  195    0    0    0    0    7   11    0
##   exDG     0   15    0    0   22 1380    0    0    0    0    0    4    0
##   exPFC    7 3106    1   13   53   37    0    5    0    6   44   43    7
##   GABA     0  439    0    0   37    4    0    0    1    0    6    5    0
##   MG       2   16  219    1    4   10    0    1    0    2    9   16    1
##   NSC      1    0    0    0    1    6  145    0    0    0    3    0    5
##   ODC      0   25    1  754    1    8    0  681    0    2    5 1153    0
##   OPC      1   20    0    0    2    5    0    5    0  520    2    4    1
##   <NA>     1   17    2    3    5   39    0    0   68    0   11   14    2
##        
##           14   15
##   ASC      1    0
##   END      3    1
##   exCA     8    0
##   exDG     0    0
##   exPFC   19    0
##   GABA  1154    0
##   MG       0    0
##   NSC      0    0
##   ODC      6    3
##   OPC      3   15
##   <NA>    46    0
##         
##             1    2    3    4    5    6    7    8    9   10   11   12   13
##   ASC1    428   24    0    2    1    5    0    3    0    0  577    8   29
##   ASC2     27    9    0    1    1    5    0    3    0    0  200    5  330
##   END       0    2    0    0    8  104    0    0    0    0    1    1    0
##   exCA1     1   73    0    0  125  173    0    0    0    0    6    8    0
##   exCA3     0   74    0    0  633   22    0    0    0    0    1    3    0
##   exDG      0   15    0    0   22 1380    0    0    0    0    0    4    0
##   exPFC1    5 2864    0    1   49   35    0    3    0    6   32   33    4
##   exPFC2    2  242    1   12    4    2    0    2    0    0   12   10    3
##   GABA1     0  415    0    0   33    3    0    0    1    0    3    5    0
##   GABA2     0   24    0    0    4    1    0    0    0    0    3    0    0
##   MG        2   16  219    1    4   10    0    1    0    2    9   16    1
##   NSC       1    0    0    0    1    6  145    0    0    0    3    0    5
##   ODC1      0   17    1  200    0    3    0  613    0    1    5  641    0
##   ODC2      0    8    0  554    1    5    0   68    0    1    0  512    0
##   OPC       1   20    0    0    2    5    0    5    0  520    2    4    1
##   <NA>      1   17    2    3    5   39    0    0   68    0   11   14    2
##         
##            14   15
##   ASC1      1    0
##   ASC2      0    0
##   END       3    1
##   exCA1     4    0
##   exCA3     4    0
##   exDG      0    0
##   exPFC1   16    0
##   exPFC2    3    0
##   GABA1   414    0
##   GABA2   740    0
##   MG        0    0
##   NSC       0    0
##   ODC1      1    2
##   ODC2      5    1
##   OPC       3   15
##   <NA>     46    0

saveRDS(list(sce_hvg=sce_hvg,df_hvg=df_hvg,all_num_clust=all_num_clust),
        file.path(dronc_dir,"post_kmeans.rds"))

3.6.2 SC3

Next seek to cluster cells using another method SC3. The code used here is based on SC3 manual. By default, when there are more than 5000 genes, SC3 will

selects a subset of cells uniformly at random (5,000), and obtains clusters from this subset. The inferred labels can be used to train a Support Vector Machine (SVM), which is employed to assign labels to the remaining cells.

By default, SC3 filter genes to select those with dropout percentage between 10 and 90%.

summary(rowData(sce)$pct_dropout_counts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.34   93.16   96.08   94.26   97.71   99.70
table(rowData(sce)$pct_dropout_counts < 90)
## 
## FALSE  TRUE 
##  9189  1418

This will end up with 1418 genes. However, we found the clustering resutls using these 1418 genes have considerable discrepency with clustering resutls from Kmeans and cell types reported by (Habib et al. 2017). Therefore, we chose to use those genes identified from previous step for SC3. Following the recommendation for runing SC3, we first clustering 2000 cells and then run SVM and predict labels of all other cells.

library(SC3)
rowData(sce_hvg)$feature_symbol = rowData(sce_hvg)$external_gene_name
date()
## [1] "Wed Jan 30 13:02:34 2019"
all_ks = c(10,12)
sce_hvg = sc3(sce_hvg,gene_filter = FALSE,
            n_cores = num_cores,ks = all_ks,biology = TRUE, 
            rand_seed = 100,svm_num_cells = 2000)
## Setting SC3 parameters...
## Your dataset contains more than 2000 cells. Adjusting the nstart parameter of kmeans to 50 for faster performance...
## Defining training cells for SVM using svm_num_cells parameter...
## Calculating distances between the cells...
## Performing transformations and calculating eigenvectors...
## Performing k-means clustering...
## Calculating consensus matrix...
## Calculating biology...
date()
## [1] "Wed Jan 30 14:02:48 2019"
dim(colData(sce_hvg))
## [1] 13181    70
colData(sce_hvg)[1:2,1:5]
## DataFrame with 2 rows and 5 columns
##                         sample_name part_cell_id Habib_clusters
##                         <character>  <character>       <factor>
## hHP1_AACACTATCTAC hHP1_AACACTATCTAC         hHP1              4
## hHP1_CTACGCATCCAT hHP1_CTACGCATCCAT         hHP1              3
##                   Habib_clusters_name Habib_cell_type
##                           <character>     <character>
## hHP1_AACACTATCTAC               exCA3            exCA
## hHP1_CTACGCATCCAT               exCA1            exCA
names(colData(sce_hvg))
##  [1] "sample_name"                                   
##  [2] "part_cell_id"                                  
##  [3] "Habib_clusters"                                
##  [4] "Habib_clusters_name"                           
##  [5] "Habib_cell_type"                               
##  [6] "Habib_TSNE1"                                   
##  [7] "Habib_TSNE2"                                   
##  [8] "is_cell_control"                               
##  [9] "total_features_by_counts"                      
## [10] "log10_total_features_by_counts"                
## [11] "total_counts"                                  
## [12] "log10_total_counts"                            
## [13] "pct_counts_in_top_50_features"                 
## [14] "pct_counts_in_top_100_features"                
## [15] "pct_counts_in_top_200_features"                
## [16] "pct_counts_in_top_500_features"                
## [17] "total_features"                                
## [18] "log10_total_features"                          
## [19] "pct_counts_top_50_features"                    
## [20] "pct_counts_top_100_features"                   
## [21] "pct_counts_top_200_features"                   
## [22] "pct_counts_top_500_features"                   
## [23] "total_features_by_counts_endogenous"           
## [24] "log10_total_features_by_counts_endogenous"     
## [25] "total_counts_endogenous"                       
## [26] "log10_total_counts_endogenous"                 
## [27] "pct_counts_endogenous"                         
## [28] "pct_counts_in_top_50_features_endogenous"      
## [29] "pct_counts_in_top_100_features_endogenous"     
## [30] "pct_counts_in_top_200_features_endogenous"     
## [31] "pct_counts_in_top_500_features_endogenous"     
## [32] "total_features_endogenous"                     
## [33] "log10_total_features_endogenous"               
## [34] "pct_counts_top_50_features_endogenous"         
## [35] "pct_counts_top_100_features_endogenous"        
## [36] "pct_counts_top_200_features_endogenous"        
## [37] "pct_counts_top_500_features_endogenous"        
## [38] "total_features_by_counts_feature_control"      
## [39] "log10_total_features_by_counts_feature_control"
## [40] "total_counts_feature_control"                  
## [41] "log10_total_counts_feature_control"            
## [42] "pct_counts_feature_control"                    
## [43] "pct_counts_in_top_50_features_feature_control" 
## [44] "pct_counts_in_top_100_features_feature_control"
## [45] "total_features_feature_control"                
## [46] "log10_total_features_feature_control"          
## [47] "pct_counts_top_50_features_feature_control"    
## [48] "pct_counts_top_100_features_feature_control"   
## [49] "total_features_by_counts_Mt"                   
## [50] "log10_total_features_by_counts_Mt"             
## [51] "total_counts_Mt"                               
## [52] "log10_total_counts_Mt"                         
## [53] "pct_counts_Mt"                                 
## [54] "total_features_Mt"                             
## [55] "log10_total_features_Mt"                       
## [56] "total_features_by_counts_Ri"                   
## [57] "log10_total_features_by_counts_Ri"             
## [58] "total_counts_Ri"                               
## [59] "log10_total_counts_Ri"                         
## [60] "pct_counts_Ri"                                 
## [61] "pct_counts_in_top_50_features_Ri"              
## [62] "pct_counts_in_top_100_features_Ri"             
## [63] "total_features_Ri"                             
## [64] "log10_total_features_Ri"                       
## [65] "pct_counts_top_50_features_Ri"                 
## [66] "pct_counts_top_100_features_Ri"                
## [67] "sc3_10_clusters"                               
## [68] "sc3_12_clusters"                               
## [69] "sc3_10_log2_outlier_score"                     
## [70] "sc3_12_log2_outlier_score"
date()
## [1] "Wed Jan 30 14:02:48 2019"
sce_hvg = sc3_run_svm(sce_hvg, ks = all_ks)
date()
## [1] "Wed Jan 30 14:03:53 2019"

Next we compare the clustering results from SC3 and the reported cell types.

for(one_ks in all_ks){
    sc3_label = paste0("sc3_",one_ks,"_clusters")
    df_hvg[[sc3_label]] = as.factor(colData(sce_hvg)[,sc3_label])

    print(smart_table(df_hvg$Habib_cell_type, df_hvg[[sc3_label]]))

    print(ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2", 
                        COL = sc3_label,TYPE = "cat"))
}
##        
##            1    2    3    4    5    6    7    8    9   10
##   ASC     23  242 1323    4    4   11   15   37    0    0
##   END      1    7    0    1    0   16   86    6    0    3
##   exCA     9  149    6    2    2   25  128  758    1   47
##   exDG     4    7    0    0    0  208 1165   34    0    3
##   exPFC   63   25   41   12  124  772  373 1908    3   20
##   GABA     4    2    3   13   40  233   97  599  114  541
##   MG      23    2   38  183    0    3    9   23    0    0
##   NSC      1    2   33  115    0    1    7    2    0    0
##   ODC   2581    2    7    4    1   12    8   23    0    1
##   OPC     11    1   97  442    1    4    5   16    0    1
##   <NA>    18    3   19   73   39    7   11   16    5   17

##        
##            1    2    3    4    5    6    7    8    9   10   11   12
##   ASC     34    1    2   19    2  242    5   14 1323    1   11    5
##   END      2    0    0    3   19    0   11   84    1    0    0    0
##   exCA   256    2    0  125   15    0  683   26    5    0   15    0
##   exDG    27    0    1    6  207    0   16 1162    0    0    2    0
##   exPFC 2223   10   12   51   18    9   33  154   44    2  770   15
##   GABA   596  132    0    8  644    0   19   33    2   16  196    0
##   MG      23  178    9   12    0    2    4    5   10   30    6    2
##   NSC      1  120    1    0    0    2    2    1    7   27    0    0
##   ODC     19    3  736 1440    2    1    2    8    8    1  168  251
##   OPC     15    0    1    7    1    0    2    5   96    0    2  449
##   <NA>    17   66    5    9   20    0   14    6   13   47    6    5

saveRDS(list(sce_hvg=sce_hvg,df_hvg=df_hvg,all_ks=all_ks),
        file.path(dronc_dir,"post_sc3.rds"))

3.7 Compare our kmeans to SC3 and to DrocNc clustering

We obtainted the cell type and clustering resutls from (Habib et al. 2017). Supplementary Table 10: supplement nmeth.4407-S10.xlsx file. Here we compare the cell type reported by Habib et al. (2017) with ours. Habib et al. (2017) identified genes with high variation by fitting a gamma distribution on the relation between mean and coefficient of variation and chose the number of PCs based on “the largest eigen value gap”. It was not clear what is the number of PCs used. Then they used these top PCs to perform tSNE anlaysis and clustering using a graph based method.

tmp_lab = smart_RT(file.path(work_dir,"cluster_num_label.txt"), 
                    sep = "\t",header = TRUE)
tmp_lab
##    Cell.ID   Name Cell.Type.ID Name.1
## 1        1 exPFC1            1  exPFC
## 2        2 exPFC2            1  exPFC
## 3        3  exCA1            2  exCA1
## 4        4  exCA3            3  exCA3
## 5        5  GABA1            4   GABA
## 6        6  GABA2            4   GABA
## 7        7   exDG            5   exDG
## 8        8   ASC1            6    ASC
## 9        9   ASC2            6    ASC
## 10      10   ODC1            7    ODC
## 11      11   ODC2            7    ODC
## 12      12    OPC            8    OPC
## 13      13     MG            9     MG
## 14      14    NSC           10    NSC
## 15      15    END           11    END
tmp_lab = name_change(tmp_lab,"Name","Cluster.Name")
tmp_lab = name_change(tmp_lab,"Name.1","Cell_Type")

tmp_res = smart_RT(file.path(work_dir,"paper_cluster_res.txt"), 
                    sep = "\t",header = TRUE,comment.char = "")
dim(tmp_res); tmp_res[1:5,]
## [1] 14963     5
##             Cell.ID X.Genes X.Transcripts Cluster.ID Cluster.Name
## 1 hHP1_AGACCGCGACTA    2289          3946          1       exPFC1
## 2 hHP1_AAATCGTAGTAG    1311          2077          1       exPFC1
## 3 hHP1_TCACCAGGCGAT     676           978          1       exPFC1
## 4 hHP1_GCATACAGTGAA     916          1425          1       exPFC1
## 5 hHP1_CCCCCTCAGTAC     481           641          1       exPFC1
tmp_res = name_change(tmp_res,"X.Genes","nGenes")
tmp_res = name_change(tmp_res,"X.Transcripts","nTranscripts")

tmp_res = smart_merge(tmp_res, tmp_lab[,c("Cluster.Name","Cell_Type")], 
                    all.x=TRUE)

summary(tmp_res$nGenes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   201.0   337.0   529.0   658.4   826.0  4249.0
smart_hist(tmp_res$nGenes,breaks=50,main="",xlab="number of genes per cell")

df_hvg$Cell.ID = colnames(sce_hvg)

smart_table(df_hvg$Cell.ID %in% tmp_res$Cell.ID)
## 
##  TRUE 
## 13181
tmp_res$Cell.ID[!(tmp_res$Cell.ID %in% df_hvg$Cell.ID)][1:5]
## [1] "PFC2-A2_GTTGAGGAGCGT" "PFC2-A2_CAAATGCCTGTC" "PFC2-A2_GGTTTGGCCCTN"
## [4] "PFC2-A2_ACCAGGCTTCGA" "PFC2-A1_TCCGTTCTTCGT"
tmp_res$Cell.ID = gsub("-",".",tmp_res$Cell.ID)
table(df_hvg$Cell.ID %in% tmp_res$Cell.ID)
## 
## FALSE  TRUE 
##  7812  5369
# Merge and compare
all_clust_res = smart_merge(df_hvg,tmp_res)
sc3_res = smart_df(colData(sce_hvg)[,paste0("sc3_",all_ks,"_clusters")])
for(ks in all_ks){
    sc3_label = paste0("sc3_",ks,"_clusters")
    sc3_res[,sc3_label] = as.factor(sc3_res[,sc3_label])
}
sc3_res$Cell.ID = colnames(sce_hvg)
all_clust_res = smart_merge(all_clust_res, sc3_res)
dim(all_clust_res)
## [1] 5369   31
all_clust_res[1:5,]
##            Cell.ID sc3_10_clusters sc3_12_clusters      sample_name
## 1 hCc_AAAAAGCTACAA               8               1 hCc_AAAAAGCTACAA
## 2 hCc_AAACAGGTGAGG               7               1 hCc_AAACAGGTGAGG
## 3 hCc_AAACCCTTTACA               8               1 hCc_AAACCCTTTACA
## 4 hCc_AAACGTGACGGA               8               1 hCc_AAACGTGACGGA
## 5 hCc_AAAGAACTCGCG               7               1 hCc_AAAGAACTCGCG
##   part_cell_id Habib_TSNE1 Habib_TSNE2 log10_total_features Habib_clusters
## 1          hCc    2.804881   -1.888592             2.866287              1
## 2          hCc   16.965708   -6.768078             3.170262              5
## 3          hCc   17.828567   -2.514648             3.044540              5
## 4          hCc   14.756007   -4.965194             2.849419              5
## 5          hCc   -1.552130    3.860966             2.716838              1
##   Habib_clusters_name Habib_cell_type   HVG_PC1   HVG_PC2 HVG_TSNE1
## 1              exPFC1           exPFC -3.809914 -1.148090 -2.054991
## 2               GABA1            GABA -3.041883 -1.756074 -9.557901
## 3               GABA1            GABA -4.335472 -1.149693 -8.302459
## 4               GABA1            GABA -3.614090 -1.958961 -7.234844
## 5              exPFC1           exPFC -4.001498 -2.066558  8.784212
##   HVG_TSNE2 KM_5 KM_6 KM_7 KM_8 KM_9 KM_10 KM_11 KM_12 KM_13 KM_14 KM_15
## 1  6.697855    1    4    1    6    5     1     9    12     7     7     2
## 2 20.722796    1    4    4    7    6     2     4     1    13    14    14
## 3 18.022212    1    4    1    6    5     1     9    12     7     7     2
## 4 15.676638    1    4    1    6    5     1     9    12     7     7     2
## 5 10.251190    1    4    1    6    5     1     9    12     7     7     2
##   Cluster.Name nGenes nTranscripts Cluster.ID Cell_Type
## 1       exPFC1    736         1034          1     exPFC
## 2        GABA1   1480         2391          5      GABA
## 3        GABA1   1107         1757          5      GABA
## 4        GABA1    708          935          5      GABA
## 5       exPFC1    520          673          1     exPFC
smart_table(all_clust_res[,c("Cell_Type","Cluster.ID")])
##          Cluster.ID
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12   13
##     ASC      0    0    0    0    0    0    0  380   82    0    0    0    0
##     END      0    0    0    0    0    0    0    0    0    0    0    0    0
##     exCA1    0    0   90    0    0    0    0    0    0    0    0    0    0
##     exCA3    0    0    0  200    0    0    0    0    0    0    0    0    0
##     exDG     0    0    0    0    0    0   64    0    0    0    0    0    0
##     exPFC 2479  262    0    0    0    0    0    0    0    0    0    0    0
##     GABA     0    0    0    0  629  469    0    0    0    0    0    0    0
##     MG       0    0    0    0    0    0    0    0    0    0    0    0   60
##     ODC      0    0    0    0    0    0    0    0    0  192  190    0    0
##     OPC      0    0    0    0    0    0    0    0    0    0    0  181    0
##     <NA>     0    0    0    0    0    0    0    0    0    0    0    0    0
##          Cluster.ID
## Cell_Type   15   16   18
##     ASC      0    0    0
##     END     18    0    0
##     exCA1    0    0    0
##     exCA3    0    0    0
##     exDG     0    0    0
##     exPFC    0    0    0
##     GABA     0    0    0
##     MG       0    0    0
##     ODC      0    0    0
##     OPC      0    0    0
##     <NA>     0   50   23
for(num_clust in all_num_clust){
    km_label = paste0("KM_",num_clust)
    print(smart_table(all_clust_res[,c("Cell_Type",km_label)]))
    t2 = melt(smart_table(all_clust_res[,c("Cell_Type",km_label)]))
    colnames(t2)[2] = "cluster"
    print(summary(t2$value))
    print(gg.heatmap(t2,paste0("kmeans ",num_clust," clusters")))
}
##          KM_5
## Cell_Type    1    2    3    4    5
##     ASC     39    0    9  413    1
##     END      4   14    0    0    0
##     exCA1   88    0    1    1    0
##     exCA3  200    0    0    0    0
##     exDG    64    0    0    0    0
##     exPFC 2690    1   29   17    4
##     GABA  1093    0    1    3    1
##     MG      18   30    7    5    0
##     ODC     35    1  343    2    1
##     OPC     22    0    1    0  158
##     <NA>    64    1    6    2    0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   97.62   20.00 2690.00

##          KM_6
## Cell_Type    1    2    3    4    5    6
##     ASC      0  413    9   39    0    1
##     END      0    0    0    5   13    0
##     exCA1    0    1    1   88    0    0
##     exCA3    0    0    0  200    0    0
##     exDG     0    0    0   64    0    0
##     exPFC    0   17   29 2690    1    4
##     GABA     0    3    1 1093    0    1
##     MG       0    4    4   15   37    0
##     ODC      0    2  343   35    1    1
##     OPC      0    0    1   22    0  158
##     <NA>     0    2    5   64    2    0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   81.35   12.00 2690.00

##          KM_7
## Cell_Type    1    2    3    4    5    6    7
##     ASC     32    9    0    7    1  413    0
##     END      5    0   13    0    0    0    0
##     exCA1   87    1    0    1    0    1    0
##     exCA3  197    0    0    3    0    0    0
##     exDG    63    0    0    1    0    0    0
##     exPFC 2648   29    1   42    4   17    0
##     GABA   226    1    0  869    0    2    0
##     MG      15    4   37    0    0    4    0
##     ODC     28  343    1    7    1    2    0
##     OPC     20    1    0    2  158    0    0
##     <NA>    29    5    2   35    0    2    0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   69.73   13.00 2648.00

##          KM_8
## Cell_Type    1    2    3    4    5    6    7    8
##     ASC    413    9    3    1    0   34    2    0
##     END      0    0    1    0    0    4    0   13
##     exCA1    1    1   33    0    0   53    2    0
##     exCA3    0    0   81    0    0  117    2    0
##     exDG     0    0   58    0    0    6    0    0
##     exPFC   17   27   27    4    0 2642   23    1
##     GABA     2    1    3    0    0  366  726    0
##     MG       4    4    1    0    0   14    0   37
##     ODC      2  344    2    1    0   26    6    1
##     OPC      0    1    1  157    0   20    2    0
##     <NA>     2    5   20    0    0    9   35    2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   61.01   13.25 2642.00

##          KM_9
## Cell_Type    1    2    3    4    5    6    7    8    9
##     ASC      0    3  413    9   34    2    1    0    0
##     END      0    1    0    0    4    0    0   13    0
##     exCA1    0   33    1    1   53    2    0    0    0
##     exCA3    0   81    0    0  117    2    0    0    0
##     exDG     0   58    0    0    6    0    0    0    0
##     exPFC    0   27   16   30 2641   22    4    1    0
##     GABA     0    3    2    1  366  726    0    0    0
##     MG       0    1    4    4   14    0    0   37    0
##     ODC     34    2    2  313   23    6    1    1    0
##     OPC      0    1    0    1   20    2  157    0    0
##     <NA>     0   19    2    6    9   35    0    2    0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   54.23    9.00 2641.00

##          KM_10
## Cell_Type    1    2    3    4    5    6    7    8    9   10
##     ASC     34    2    1  413    3    0    1    0    0    8
##     END      4    0    0    0    1    0    0    0   13    0
##     exCA1   54    2    0    1   32    0    0    0    0    1
##     exCA3  118    2    0    0   80    0    0    0    0    0
##     exDG     6    0    0    0   58    0    0    0    0    0
##     exPFC 2639   24    0   15   27    0    4    1    0   31
##     GABA   366  726    0    2    3    0    0    0    0    1
##     MG      14    0    0    5    1    0    0   35    0    5
##     ODC     23    6   37    2    2    0    1    1    0  310
##     OPC     20    2    0    0    1    0  157    0    0    1
##     <NA>     9   35    0    2   19    0    0    1    0    7
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   48.81    6.00 2639.00

##          KM_11
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11
##     ASC      0  386    9    1    0   35    0    1   30    0    0
##     END      0    0    0    0    0    0    0    1    4   13    0
##     exCA1    0    1    1    2    0    0    0   32   54    0    0
##     exCA3    0    0    0    2    0    0    0   78  120    0    0
##     exDG     0    0    0    0    0    0    0   58    6    0    0
##     exPFC    0   19   27   24    1    1    0   27 2638    0    4
##     GABA     0    3    1  725    0    0    0    3  366    0    0
##     MG       0    5    5    0   35    0    0    1   14    0    0
##     ODC      0    2  313    6    1    0   34    2   23    0    1
##     OPC      0    2    1    2    0    0    0    1   20    0  155
##     <NA>     0    2    7   35    1    0    0   19    9    0    0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   44.37    5.00 2638.00

##          KM_12
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12
##     ASC      2  386    0    0    0    1    9    0    0   35    1   28
##     END      0    0    0    0    0    1    0    0   13    0    0    4
##     exCA1    1    1    0    0    0   12    1    0    0    0   31   44
##     exCA3    2    0    0    0    0    7    0    0    0    0  142   49
##     exDG     0    0    0    0    0   53    0    0    0    0    5    6
##     exPFC   14   19    1    4    0   16   27    0    0    1   38 2621
##     GABA   710    3    0    0    0    0    1    0    0    0   11  373
##     MG       0    5   35    0    0    1    5    0    0    0    1   13
##     ODC      5    2    1    1   34    2  313    0    0    0    0   24
##     OPC      2    2    0  155    0    2    1    0    0    0    2   17
##     <NA>    31    2    1    0    0   21    7    0    0    0    1   10
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   40.67    5.25 2621.00

##          KM_13
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12   13
##     ASC      2    1    0  386    0    0   29    1    1    0    7   33    2
##     END      0    0    0    0    0   13    4    0    1    0    0    0    0
##     exCA1   31    0    0    1    0    0   44    0   12    0    1    0    1
##     exCA3  142    0    0    0    0    0   49    0    7    0    0    0    2
##     exDG     5    0    0    0    0    0    6    0   53    0    0    0    0
##     exPFC   38    0    0   15    1    0 2622    6   16    4   25    1   13
##     GABA    11    0    0    2    0    0  369    0    0    0    2    0  714
##     MG       1    0    0    5   34    0   11    0    1    0    8    0    0
##     ODC      0   22    0    1    1    0   23   68    2    1  259    0    5
##     OPC      2    0    0    1    0    0   17    0    2  156    1    0    2
##     <NA>     1    0    0    2    1    0   10    0   21    0    7    0   31
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   37.55    5.50 2622.00

##          KM_14
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12   13
##     ASC      0  331    1    0   71    1   28    0    0   21    0    1    7
##     END      0    0    0    0    0    1    4    0    0    0   13    0    0
##     exCA1   31    1    0    0    0   12   44    0    0    0    0    0    1
##     exCA3  142    0    0    0    0    7   49    0    0    0    0    0    0
##     exDG     5    0    0    0    0   53    6    0    0    0    0    0    0
##     exPFC   38   19    6    0    0   16 2621    4    1    1    0    0   22
##     GABA    11    5    0    0    0    0  370    0    0    0    0    0    2
##     MG       1    6    0    0    0    1   11    0   33    0    0    0    8
##     ODC      0    2   68    0    0    2   23    0    1    0    0   22  259
##     OPC      2    2    0    0    0    2   17  155    0    0    0    0    1
##     <NA>     1    2    0    0    0   21   10    0    1    0    0    0    7
##          KM_14
## Cell_Type   14
##     ASC      1
##     END      0
##     exCA1    1
##     exCA3    2
##     exDG     0
##     exPFC   13
##     GABA   710
##     MG       0
##     ODC      5
##     OPC      2
##     <NA>    31
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   34.86    5.00 2621.00

##          KM_15
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12   13
##     ASC     71   27    0    1    0    1    0    1    0    0  335    7   18
##     END      0    4    0    0    0    1    0    0   13    0    0    0    0
##     exCA1    0   42    0    0   33   12    0    0    0    0    1    1    0
##     exCA3    0   48    0    0  143    7    0    0    0    0    0    0    0
##     exDG     0    6    0    0    5   53    0    0    0    0    0    0    0
##     exPFC    0 2621    1    6   38   16    0    0    0    4   19   22    1
##     GABA     0  370    0    0   11    0    0    0    0    0    5    2    0
##     MG       0   11   33    0    1    1    0    0    0    0    6    8    0
##     ODC      0   23    1   73    0    2    0   22    0    0    2  254    0
##     OPC      0   17    0    0    2    2    0    0    0  155    2    1    0
##     <NA>     0   10    1    0    1   21    0    0    0    0    2    7    0
##          KM_15
## Cell_Type   14   15
##     ASC      1    0
##     END      0    0
##     exCA1    1    0
##     exCA3    2    0
##     exDG     0    0
##     exPFC   13    0
##     GABA   710    0
##     MG       0    0
##     ODC      5    0
##     OPC      2    0
##     <NA>    31    0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   32.54    4.00 2621.00

for(ks in all_ks){
    sc3_label = paste0("sc3_",ks,"_clusters")
    print(smart_table(all_clust_res[,c("Cell_Type",sc3_label)]))
    t2 = melt(smart_table(all_clust_res[,c("Cell_Type",sc3_label)]))
    colnames(t2)[2] = "cluster"
    print(summary(t2$value))
    print(gg.heatmap(t2,paste0("sc3 ",ks," clusters")))
}
##          sc3_10_clusters
## Cell_Type    1    2    3    4    5    6    7    8    9   10
##     ASC      8   57  355    1    4   10    6   21    0    0
##     END      0    0    2   12    0    0    2    1    0    1
##     exCA1    2    4    0    0    1   10    9   63    0    1
##     exCA3    0    8    0    0    1    8   23  153    0    7
##     exDG     0    0    0    0    0   11   48    5    0    0
##     exPFC   27    9   16    4  124  771  291 1485    2   12
##     GABA     1    1    2   11   40  233   67  414   49  280
##     MG       7    0    9   27    0    3    1   13    0    0
##     ODC    347    0    1    1    1   12    2   17    0    1
##     OPC      2    1   32  126    1    4    2   12    0    1
##     <NA>     7    0    2    9   27    6    1    9    4    8
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    2.50   48.81   12.00 1485.00

##          sc3_12_clusters
## Cell_Type    1    2    3    4    5    6    7    8    9   10   11   12
##     ASC     22    1    1    6    1   57    1    6  354    1   11    1
##     END      3   11    0    0    1    0    0    0    0    2    0    1
##     exCA1   51    0    0    6    1    0   21    2    0    0    9    0
##     exCA3   92    0    0   17    2    0   81    2    0    0    6    0
##     exDG     5    0    0    0    9    0    0   48    0    0    2    0
##     exPFC 1735    5    2   25   14    3   16  143   18    1  770    9
##     GABA   424   60    0    2  348    0   18   33    1   16  196    0
##     MG      11   26    3    4    0    0    2    0    5    5    4    0
##     ODC     17    2   63  242    2    0    0    2    1    1   19   33
##     OPC     12    0    0    2    1    0    1    2   32    0    2  129
##     <NA>     9    6    0    3   10    0    8    1    3   27    5    1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    2.00   40.67   11.25 1735.00

We plot our TSNE colored by our clustering results.

all_vars = c("Cell_Type",paste0("KM_",all_num_clust),
            paste0("sc3_",all_ks,"_clusters"))
for(one_var in all_vars){
    print(ggplot_custom(DATA = all_clust_res,X = "HVG_TSNE1",
        Y = "HVG_TSNE2",COL = one_var,TYPE = "cat"))
}

Next we plot the TSNE reported by (Habib et al. 2017), colored by our clustering results.

all_vars = c("Cell_Type",paste0("KM_",all_num_clust),
            paste0("sc3_",all_ks,"_clusters"))
for(one_var in all_vars){
    print(ggplot_custom(DATA = all_clust_res,X = "Habib_TSNE1",
        Y = "Habib_TSNE2",COL = one_var,TYPE = "cat"))
}

Finally we save the sce object and the clustering results

saveRDS(sce,file.path(dronc_dir,"sce.rds"))
saveRDS(sce_hvg,file.path(dronc_dir,"sce_hvg.rds"))
saveRDS(all_clust_res,file.path(dronc_dir,"all_clust_res.rds"))

4 Session information

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] SC3_1.8.0                   Rtsne_0.13                 
##  [3] svd_0.4.1                   limma_3.36.5               
##  [5] scran_1.8.4                 scater_1.8.4               
##  [7] ggplot2_3.0.0               biomaRt_2.36.1             
##  [9] DropletUtils_1.0.3          SingleCellExperiment_1.2.0 
## [11] SummarizedExperiment_1.10.1 DelayedArray_0.6.4         
## [13] BiocParallel_1.14.2         matrixStats_0.54.0         
## [15] Biobase_2.40.0              GenomicRanges_1.32.6       
## [17] GenomeInfoDb_1.16.0         IRanges_2.14.10            
## [19] S4Vectors_0.18.3            BiocGenerics_0.26.0        
## [21] data.table_1.12.0           BiocInstaller_1.30.0       
## 
## loaded via a namespace (and not attached):
##   [1] ggbeeswarm_0.6.0         colorspace_1.3-2        
##   [3] rjson_0.2.20             class_7.3-14            
##   [5] dynamicTreeCut_1.63-1    rprojroot_1.3-2         
##   [7] XVector_0.20.0           DT_0.4                  
##   [9] bit64_0.9-7              mvtnorm_1.0-8           
##  [11] AnnotationDbi_1.42.1     codetools_0.2-15        
##  [13] tximport_1.8.0           doParallel_1.0.11       
##  [15] robustbase_0.93-2        knitr_1.20              
##  [17] cluster_2.0.7-1          pheatmap_1.0.10         
##  [19] shinydashboard_0.7.0     shiny_1.1.0             
##  [21] rrcov_1.4-4              compiler_3.5.0          
##  [23] httr_1.3.1               backports_1.1.2         
##  [25] assertthat_0.2.0         Matrix_1.2-14           
##  [27] lazyeval_0.2.1           later_0.7.3             
##  [29] htmltools_0.3.6          prettyunits_1.0.2       
##  [31] tools_3.5.0              bindrcpp_0.2.2          
##  [33] igraph_1.2.2             gtable_0.2.0            
##  [35] glue_1.3.0               GenomeInfoDbData_1.1.0  
##  [37] reshape2_1.4.3           dplyr_0.7.6             
##  [39] doRNG_1.7.1              Rcpp_0.12.18            
##  [41] gdata_2.18.0             iterators_1.0.10        
##  [43] DelayedMatrixStats_1.2.0 stringr_1.3.1           
##  [45] mime_0.5                 irlba_2.3.2             
##  [47] rngtools_1.3.1           gtools_3.8.1            
##  [49] WriteXLS_4.0.0           statmod_1.4.30          
##  [51] XML_3.98-1.15            DEoptimR_1.0-8          
##  [53] edgeR_3.22.3             zlibbioc_1.26.0         
##  [55] scales_1.0.0             hms_0.4.2               
##  [57] promises_1.0.1           rhdf5_2.24.0            
##  [59] RColorBrewer_1.1-2       yaml_2.2.0              
##  [61] memoise_1.1.0            gridExtra_2.3           
##  [63] pkgmaker_0.27            stringi_1.2.4           
##  [65] RSQLite_2.1.1            pcaPP_1.9-73            
##  [67] foreach_1.4.4            e1071_1.7-0             
##  [69] caTools_1.17.1.1         bibtex_0.4.2            
##  [71] rlang_0.2.1              pkgconfig_2.0.1         
##  [73] bitops_1.0-6             evaluate_0.11           
##  [75] lattice_0.20-35          ROCR_1.0-7              
##  [77] purrr_0.2.5              Rhdf5lib_1.2.1          
##  [79] bindr_0.1.1              htmlwidgets_1.2         
##  [81] labeling_0.3             cowplot_0.9.3           
##  [83] bit_1.1-14               tidyselect_0.2.4        
##  [85] plyr_1.8.4               magrittr_1.5            
##  [87] R6_2.2.2                 gplots_3.0.1            
##  [89] DBI_1.0.0                pillar_1.3.0            
##  [91] withr_2.1.2              RCurl_1.95-4.11         
##  [93] tibble_1.4.2             crayon_1.3.4            
##  [95] KernSmooth_2.23-15       rmarkdown_1.10          
##  [97] viridis_0.5.1            progress_1.2.0          
##  [99] locfit_1.5-9.1           grid_3.5.0              
## [101] blob_1.1.1               FNN_1.1.2.1             
## [103] digest_0.6.15            xtable_1.8-2            
## [105] httpuv_1.4.5             munsell_0.5.0           
## [107] registry_0.5             beeswarm_0.2.3          
## [109] viridisLite_0.3.0        vipor_0.4.5

Reference

Habib, Naomi, Inbal Avraham-Davidi, Anindita Basu, Tyler Burks, Karthik Shekhar, Matan Hofree, Sourav R Choudhury, et al. 2017. “Massively Parallel Single-Nucleus Rna-Seq with Dronc-Seq.” Nature Methods 14 (10). Nature Publishing Group: 955.

Lun, Aaron TL, Karsten Bach, and John C Marioni. 2016. “Pooling Across Cells to Normalize Single-Cell Rna Sequencing Data with Many Zero Counts.” Genome Biology 17 (1). BioMed Central: 75.